
clear all
set maxvar 30000
set more off

global dir /Volumes/Zihao_SSD2/PatentsView

*** ===================================================================================================
*** Generate regression panel for main analysis
*** Zihao Li. 06/2024
*** ===================================================================================================

*** Append g_patent_clean_final.csv with the variable "num_citations" 
import delimited $dir/temp/actual_citation_lst.csv, clear
keep patent_id num_citations
save $dir/temp/actual_citation_lst.dta, replace

* Import comprehensive patent-level dataset (1976-2022). Output of gen_var_gender.py
import delimited $dir/cleandata/g_patent_clean_final.csv, clear

merge 1:1 patent_id using $dir/temp/actual_citation_lst.dta // 6,716,818
drop if _merge==2
drop _merge
format %30s assignee_organization assignee_city 
sort patent_id
drop if assignee_organization == ""
drop if patent_year < 1981 | patent_year > 2015 // 4370221


*** ===================================================================================================
*** Append lead-inventor rank (gen_leadinventor_rank.py)
merge 1:1 patent_id using $dir/temp/leadinventor_rank.dta // 4,368,475
drop if _merge != 3
drop _merge


*** ===================================================================================================
*** Append lead-inventor centrality and number of previous number of coauthors
merge 1:1 patent_id using $dir/temp/leadinventor_info.dta // 4,368,475
drop if _merge !=3
drop _merge
rename (deg_centrality cumulative_coauthors) (lead_centrality lead_cum_coauthors)

*** ===================================================================================================
*** Append patent's commercial value (quality) from Kogan et al. (2017)
*** github.com/KPSS2017/Technological-Innovation-Resource-Allocation-and-Growth-Extended-Data (2020 version)
preserve
import delimited $dir/rawdata/KPSS_2020_public.csv, clear
keep patent_num xi_real xi_nominal
rename patent_num patent_id
save $dir/temp/KPSS_2020_public.dta, replace
restore

merge 1:1 patent_id using $dir/temp/KPSS_2020_public.dta // 1,812,803
drop if _merge!=3
drop _merge


*** ===================================================================================================
*** Append assignee rank (gen_dictionary_firm.py)
merge m:1 assignee_id using $dir/temp/firmdict.dta // 1,812,803
drop if _merge==2
drop _merge

* Generate firm_tier variables
gen firm_tier = 1 if assignee_rank <=10
replace firm_tier = 2 if (assignee_rank > 10 & assignee_rank <= 50)
replace firm_tier = 3 if (assignee_rank > 50 & assignee_rank <= 200)
replace firm_tier = 4 if (assignee_rank > 200)


*** ===================================================================================================
* Generate female-patent share by patent class
bys main_cpc_section: egen fshare_cpcsection = mean(allfemale_09_100==1)
bys main_cpc_class: egen fshare_cpcclass = mean(allfemale_09_100==1)
bys main_cpc_subclass: egen fshare_cpcsubclass = mean(allfemale_09_100==1)

bys main_cpc_section: egen flshare_cpcsection = mean(leadfemale_09_100==1)
bys main_cpc_class: egen flshare_cpcclass = mean(leadfemale_09_100==1)
bys main_cpc_subclass: egen flshare_cpcsubclass = mean(leadfemale_09_100==1)

bys main_cpc_section: egen feshare_cpcsection = mean(existfemale_09_100==1)
bys main_cpc_class: egen feshare_cpcclass = mean(existfemale_09_100==1)
bys main_cpc_subclass: egen feshare_cpcsubclass = mean(existfemale_09_100==1)

* Generate female inventor share by patent class 
bys main_cpc_section: egen total_inventors_f_cpcsection = sum(num_inventors_f)
bys main_cpc_section: egen total_inventors_m_cpcsection = sum(num_inventors_m)
bys main_cpc_section: egen total_inventors_cpcsection = sum(num_inventors)
bys main_cpc_class: egen total_inventors_f_cpcclass = sum(num_inventors_f)
bys main_cpc_class: egen total_inventors_m_cpcclass = sum(num_inventors_m)
bys main_cpc_class: egen total_inventors_cpcclass = sum(num_inventors)
bys main_cpc_subclass: egen total_inventors_f_cpcsubclass = sum(num_inventors_f)
bys main_cpc_subclass: egen total_inventors_m_cpcsubclass = sum(num_inventors_m)
bys main_cpc_subclass: egen total_inventors_cpcsubclass = sum(num_inventors)

gen finvshare_cpcsection = total_inventors_f_cpcsection / total_inventors_cpcsection
gen finvshare_cpcclass = total_inventors_f_cpcclass / total_inventors_cpcclass
gen finvshare_cpcsubclass = total_inventors_f_cpcsubclass / total_inventors_cpcsubclass
gen minvshare_cpcsection = total_inventors_m_cpcsection / total_inventors_cpcsection
gen minvshare_cpcclass = total_inventors_m_cpcclass / total_inventors_cpcclass
gen minvshare_cpcsubclass = total_inventors_m_cpcsubclass / total_inventors_cpcsubclass

*** ===================================================================================================
*** Save two identical copies. One for citing patent (i) and one for cited patent (j).
sort patent_id
save $dir/temp/patent.dta, replace

preserve
ds
local varlist = r(varlist)
foreach var of local varlist {
    rename `var' `var'_i
}
save $dir/temp/patent_i.dta, replace
restore

preserve
ds
local varlist = r(varlist)
foreach var of local varlist {
    rename `var' `var'_j
}
save $dir/temp/patent_j.dta, replace
restore


* EXAMINER omission panel
preserve
import delimited $dir/temp/omission_panel5_examiner.csv, clear
keep patent_id cited_patent_id omission sim_score
rename (omission patent_id cited_patent_id) (omission_examiner patent_id_i patent_id_j)
save $dir/temp/omission_panel5_examiner.dta, replace
restore


* OVERALL omission panel
import delimited $dir/temp/omission_panel5.csv, clear
keep patent_id cited_patent_id omission sim_score
rename (patent_id cited_patent_id) (patent_id_i patent_id_j)
merge 1:1 patent_id_i patent_id_j sim_score using $dir/temp/omission_panel5_examiner.dta
drop if _merge!=3
drop _merge


*** ===================================================================================================
*** Merge with citing patents (i) and cited patents (j).
merge m:1 patent_id_i using $dir/temp/patent_i.dta // 8,731,650
drop if _merge!=3
drop _merge
merge m:1 patent_id_j using $dir/temp/patent_j.dta // 4,588,277
drop if _merge!=3
drop _merge
sort patent_id_i patent_id_j


*** ===================================================================================================
*** Append SHORTEST-PATH variable 
preserve
import delimited $dir/cleandata/shortest_path_sum_1981_2015.csv, clear
rename (patent_id_x patent_year_x patent_id_y) (patent_id_i patent_year_i patent_id_j)
save $dir/temp/shortest_path_sum_1981_2015.dta, replace
restore

merge 1:1 patent_id_i patent_year_i patent_id_j using $dir/temp/shortest_path_sum_1981_2015.dta // 4,588,275
drop if _merge!=3 // 2 unmatched from master
drop _merge


*** ===================================================================================================
*** Append attorney information
preserve
import delimited $dir/cleandata/g_attorney_clean.csv, clear
format %20s a_gender_09_100_list a_gender_09_50_list a_gender_08_100_list a_gender_08_50_list
save $dir/cleandata/g_attorney_clean.dta, replace
restore

* 1,831,114 matched. 2,764,316 Master, 1,551,045 Using
* 1,693,687 ; 2,746,663 ; 1,570,623 
merge m:1 patent_id_i using $dir/cleandata/g_attorney_clean.dta // 1,737,660
drop if _merge==2
drop _merge


*** ===================================================================================================
*** Append examiner information (patent_id_i is string in g_examiner_gender_race_temp.dta)
tostring patent_id_i, replace
merge m:1 patent_id_i using $dir/temp/g_examiner_gender_temp.dta // 4,584,900
drop if _merge!=3
drop _merge
destring patent_id_i, replace


*** ===================================================================================================
*** Generate regression variables
* same main_cpc_subclass
gen same_main_cpc = 1 if main_cpc_subclass_i == main_cpc_subclass_j
replace same_main_cpc = 0 if same_main_cpc != 1

* same assignee_country
gen same_assignee_country = 1 if assignee_country_i == assignee_country_j & assignee_country_i != "" & assignee_country_j != ""
replace same_assignee_country = 0 if same_assignee_country != 1

* same assignee_location
gen same_assignee_location = 1 if assignee_location_id_i == assignee_location_id_j & assignee_location_id_i != "" & assignee_location_id_j != ""
replace same_assignee_location = 0 if same_assignee_location != 1

* same assignee
gen same_assignee = 1 if assignee_id_i == assignee_id_j
replace same_assignee = 0 if same_assignee != 1

* years lag between citing patent and cited patent
gen years_lag = patent_year_i - patent_year_j

* KPSS commercial value (quality) variable (convert to dollar)
gen xi_dollar_real_i = xi_real_i * 1000000
gen xi_dollar_nominal_i = xi_nominal_i * 1000000
gen xi_dollar_real_j = xi_real_j* 1000000
gen xi_dollar_nominal_j = xi_nominal_j * 1000000
gen dollar_real_log_i = log(xi_dollar_real_i)
gen dollar_real_log_j = log(xi_dollar_real_j)
gen dollar_nominal_log_i = log(xi_dollar_nominal_i)
gen dollar_nominal_log_j = log(xi_dollar_nominal_j)


* Combine black, hispanic, and asian into "minority" and create variable exist_minority
gen exist_minority80_i = (strpos(race80_list_i,"asian")>0) | (strpos(race80_list_i,"hispanic")>0) | (strpos(race80_list_i,"nh_black")>0)
gen exist_minority80_j = (strpos(race80_list_j,"asian")>0) | (strpos(race80_list_j,"hispanic")>0) | (strpos(race80_list_j,"nh_black")>0)

gen noambig_race80_i = strpos(race80_list_i,"ambiguous")==0
gen noambig_race80_j = strpos(race80_list_j,"ambiguous")==0

drop gender_09_100_list* gender_io_09_100_list* gender_09_50_list* gender_08_100_list* gender_08_50_list* gender_io_05_100_list* gender_io_06_100_list* gender_io_07_100_list* gender_io_08_100_list* race80_list*
drop if sim_score == 1

* Generate gender interaction variables
gen allfemale_09_100_ji = allfemale_09_100_i * allfemale_09_100_j
gen leadfemale_09_100_ji = leadfemale_09_100_i * leadfemale_09_100_j
gen allfemale_leadfemale_09_100_ji = allfemale_09_100_j * leadfemale_09_100_i
gen leadfemale_allfemale_09_100_ji = leadfemale_09_100_j * allfemale_09_100_i

gen existfemale_09_100_ji = existfemale_09_100_i * existfemale_09_100_j
gen allfemale_existfemale_09_100_ji = allfemale_09_100_j * existfemale_09_100_i
gen existfemale_allfemale_09_100_ji = existfemale_09_100_j * allfemale_09_100_i

* Generate gender-(interaction) variables for examiners
gen efemale_09_100 = 1 if e_gender_09_100 == "female"
replace efemale_09_100 = 0 if e_gender_09_100 == "male" | e_gender_09_100 == "ambiguous"
gen efemale_allfemale_09_100_j = efemale_09_100 * allfemale_09_100_j
gen efemale_allfemale_09_100_i = efemale_09_100 * allfemale_09_100_i
gen efemale_allfemale_09_100_ji = efemale_09_100 * allfemale_09_100_i * allfemale_09_100_j
gen efemale_leadfemale_09_100_j = efemale_09_100 * leadfemale_09_100_j
gen examiner_allinter = allfemale_09_100_j * allfemale_09_100_i * efemale_09_100

* Generate gender-(interaction) variables for attorneys
gen a_allfemale_allfemale_09_100_j = a_allfemale_09_100 * allfemale_09_100_j
gen a_allfemale_allfemale_09_100_i = a_allfemale_09_100 * allfemale_09_100_i
gen a_allfemale_allfemale_09_100_ji = a_allfemale_09_100 * allfemale_09_100_i * allfemale_09_100_j
gen a_allfemale_leadfemale_09_100_j = a_allfemale_09_100 * leadfemale_09_100_j
gen attorney_allinter = allfemale_09_100_j * allfemale_09_100_i * a_allfemale_09_100

* Generate race variables
replace lead_race80_j = "minority" if lead_race80_j == "hispanic" | lead_race80_j == "nh_black"
gen exist_minority80_ji = exist_minority80_j * exist_minority80_i

* Generate ordered shortest_path variable
replace node_dist_all = 999 if node_dist_all == .
replace node_dist_first = 999 if node_dist_first == .

* Based on all inventors
gen binned_node_dist_all = 1 if node_dist_all >= 6
replace binned_node_dist_all = 2 if node_dist_all == 5
replace binned_node_dist_all = 3 if node_dist_all == 4
replace binned_node_dist_all = 4 if node_dist_all == 3
replace binned_node_dist_all = 5 if node_dist_all == 2
replace binned_node_dist_all = 6 if node_dist_all == 1
replace binned_node_dist_all = 7 if node_dist_all == 0

* Based on first inventor
gen binned_node_dist_first = 1 if node_dist_first >= 6
replace binned_node_dist_first = 2 if node_dist_first == 5
replace binned_node_dist_first = 3 if node_dist_first == 4
replace binned_node_dist_first = 4 if node_dist_first == 3
replace binned_node_dist_first = 5 if node_dist_first == 2
replace binned_node_dist_first = 6 if node_dist_first == 1
replace binned_node_dist_first = 7 if node_dist_first == 0


*** Export final dataset for regression
export delimited using $dir/regdata/reg_panel.csv, replace




*** ===================================================================================================
*** This part saves information about patent family (Younge and Kuhn)
*** Save two versions. One for citing patent (i) and one for cited patent (j).
import delimited $dir/rawdata/Younge-Kuhn_Patent_Families_2017-09-25.csv, clear
rename patent_number patent_id
preserve
ds
local varlist = r(varlist)
foreach var of local varlist {
    rename `var' `var'_i
}
save $dir/temp/patent_family_i.dta, replace
restore

preserve
ds
local varlist = r(varlist)
foreach var of local varlist {
    rename `var' `var'_j
}
save $dir/temp/patent_family_j.dta, replace
restore



*** ===================================================================================================
*** This part is for "Effect of omission on future productivity"
*** Used to generate Table 10, B16
*** ===================================================================================================
use $dir/cleandata/g_inventor_gender_race_age.dta, clear
keep patent_id patent_year inventor_name inventor_id gender_09_100 race80

* Drop non-numeric patents
gen temp_non_numeric = regexm(patent_id, "[^0-9]")
drop if temp_non_numeric == 1
destring patent_id, replace
drop temp_non_numeric
drop if patent_year < 1981 | patent_year > 2015
drop patent_year

merge m:1 patent_id using $dir/temp/patent.dta
drop if _merge != 3
drop _merge

gen xi_dollar_real = xi_real * 1000000
gen xi_dollar_nominal = xi_nominal * 1000000
gen dollar_real_log = log(xi_dollar_real)
gen dollar_nominal_log = log(xi_dollar_nominal)

* Aggregate data from patent-inventor-level to inventor-year-level
sort inventor_id patent_year
bysort inventor_id patent_year: gen num_patents = _N
bysort inventor_id patent_year: egen avg_dollar_real_log = mean(dollar_real_log)
drop dollar_real_log dollar_nominal_log xi_real xi_nominal xi_dollar_real xi_dollar_nominal

* Main CPC section of inventor
bys inventor_id main_cpc_section: gen patent_count = _N
by inventor_id: egen max_patent_count = max(patent_count) // max over all CPC sections
by inventor_id: gen rep_cpcsection = main_cpc_section if patent_count == max_patent_count
gsort inventor_id -rep_cpcsection
replace rep_cpcsection = rep_cpcsection[_n-1] if rep_cpcsection=="" & inventor_id==inventor_id[_n-1]
drop patent_count max_patent_count

* Main CPC class of inventor
bys inventor_id main_cpc_class: gen patent_count = _N
by inventor_id: egen max_patent_count = max(patent_count) // max over all CPC classes
by inventor_id: gen rep_cpcclass = main_cpc_class if patent_count == max_patent_count
gsort inventor_id -rep_cpcclass
replace rep_cpcclass = rep_cpcclass[_n-1] if rep_cpcclass=="" & inventor_id==inventor_id[_n-1]
drop patent_count max_patent_count

* Main CPC subclass of inventor
bys inventor_id main_cpc_subclass: gen patent_count = _N
by inventor_id: egen max_patent_count = max(patent_count) // max over all CPC subclasses
by inventor_id: gen rep_cpcsubclass = main_cpc_subclass if patent_count == max_patent_count
gsort inventor_id -rep_cpcsubclass
replace rep_cpcsubclass = rep_cpcsubclass[_n-1] if rep_cpcsubclass=="" & inventor_id==inventor_id[_n-1]
drop patent_count max_patent_count

* Inventors with equal number of patents in >1 CPC
by inventor_id: gen assertion_failed_subclass = (rep_cpcsubclass != rep_cpcsubclass[1])
by inventor_id: egen assertion_failed_subclass_ = max(assertion_failed_subclass)
gsort inventor_id -patent_year
replace rep_cpcsubclass = rep_cpcsubclass[_n-1] if assertion_failed_subclass_==1 & inventor_id==inventor_id[_n-1]
drop assertion_failed_*
by inventor_id: gen assertion_failed_class = (rep_cpcclass != rep_cpcclass[1])
by inventor_id: egen assertion_failed_class_ = max(assertion_failed_class)
replace rep_cpcclass = rep_cpcclass[_n-1] if assertion_failed_class_==1 & inventor_id==inventor_id[_n-1]
drop assertion_failed_*
by inventor_id: gen assertion_failed_section = (rep_cpcsection != rep_cpcsection[1])
by inventor_id: egen assertion_failed_section_ = max(assertion_failed_section)
replace rep_cpcsection = rep_cpcsection[_n-1] if assertion_failed_section_==1 & inventor_id==inventor_id[_n-1]
drop assertion_failed_*

* Sanity checks before collapsing to inventor-year-level
by inventor_id: assert rep_cpcsection == rep_cpcsection[1]
by inventor_id: assert rep_cpcclass == rep_cpcclass[1]
by inventor_id: assert rep_cpcsubclass == rep_cpcsubclass[1]
sort inventor_id patent_year
by inventor_id patent_year: assert gender_09_100 == gender_09_100[1]
by inventor_id patent_year: assert race80 == race80[1]
by inventor_id patent_year: assert num_patents == num_patents[1]
by inventor_id patent_year: assert avg_dollar_real_log == avg_dollar_real_log[1]

duplicates drop inventor_id patent_year, force
rename patent_year year
keep inventor_id year gender_09_100 race80 rep* avg_dollar_real_log num_patents

* Save dataset
save $dir/temp/g_inventor_gender_race_age_short.dta, replace
